### Configuration
################################################################################
### R Options
# Query SLURM for the resources configured on the current node
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
# NOTE: despite the name, this value is in bytes (SLURM reports MB; * 1024^2 converts to bytes,
# which is what future.globals.maxSize expects)
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2

# Fall back to safe defaults if the scontrol output could not be parsed (e.g. not on a SLURM node)
if (is.na(cpus)) cpus = 1L
if (is.na(memInKB)) memInKB = 20 * 1024^3

options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        future.globals.maxSize=min(memInKB, 20 * 1024^3),
        mc.cores=max(cpus, 1),  # use all configured CPUs (at least 1); min(cpus, 1) would always yield 1
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

### Functions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R", 
                        "functions_biomart.R", 
                        "functions_degs.R", 
                        "functions_io.R", 
                        "functions_plotting.R", 
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
# stop() concatenates its arguments without separators, so the message is well-formed
# (the previous paste() call inserted stray spaces and produced "'.!")
if (any(!file_exists)) {
  stop("The following files could not be found: ",
       paste(git_files_to_source[!file_exists], collapse=", "),
       ". Please check the git directory at '", param$path_to_git, "'.")
}
invisible(purrr::map(git_files_to_source, source))

### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11,
# 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"
# Dispatch on the chosen mode; the unnamed last entry is the fallback
switch(param$debugging_mode,
       terminal_debugger = on_error_start_terminal_debugger(),
       print_traceback = on_error_just_print_traceback(),
       default_debugging = on_error_default_debugging(),
       on_error_default_debugging())
### Rmarkdown configuration
################################################################################

### Create output directories
# Ensure the "figures" and "data" output folders exist under the project output path
for (subdir in c("figures", "data")) {
  dir_path = file.path(param$path_out, subdir)
  if (!file.exists(dir_path)) dir.create(dir_path, recursive=TRUE, showWarnings=FALSE)
}

### R Options
options(citation_format="pandoc", 
        knitr.table.format="html",
        kableExtra_view_html=TRUE)


### Knitr default options
knitr::opts_chunk$set(
  echo = TRUE,                    # show code in the output
  cache = FALSE,                  # never cache chunk results
  message = TRUE,                 # display messages
  warning = TRUE,                 # display warnings
  results = "hold",               # print results only after the whole chunk ran
  tidy = FALSE,                   # keep code formatting as written
  fig.width = 10,                 # default figure width in inches
  class.source = "fold-hide",     # collapse code blocks by default
  dev = c("png", "svg", "tiff"),  # figure devices; the first (png) is used for HTML output
  dev.args = list(png = list(type = "cairo"),   # png: cairo works on the cluster
                  tiff = list(type = "cairo", compression = "zip")),  # tiff: cairo, "zip" compression is 'deflate (Adobe-style)'
  dpi = 300,                      # figure resolution
  fig.retina = 2,                 # retina multiplier
  fig.path = paste0(file.path(param$path_out, "figures"), "/")  # figure output path (trailing "/" is required)
)

### Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

### Required libraries
library(magrittr)

### Set the bibliographic environment
# Start from an empty bibliography
knitcitations::cleanbib()

# If the DOI publication servers are unreachable, knitcitations would collect no
# citations, write no references.bib, and pandoc would fail. Citing one bundled
# reference here guarantees that at least one citation always exists.
bib_ref = knitcitations::read.bibtex(file.path(param$path_to_git, "assets", "sc_analysis_references.bib"))
invisible(knitcitations::citep(bib_ref))

### Figure heights
# Tall figures
fig_large_height = 8
# One landscape figure
fig_standard_height = 4
# Two figures side by side (e.g. umaps)
fig_standard2_height = 5
# Three figures side by side (e.g. umaps)
fig_standard3_height = 4
# Four figures side by side (e.g. umaps)
fig_standard4_height = 2.5
# 2x2 grid of four figures (e.g. umaps)
fig_patchwork4_height = 2 * fig_standard2_height
# 2x3 grid of six figures (e.g. umaps)
fig_patchwork6_height = 2 * fig_standard3_height
# 4x2 grid of eight figures (e.g. umaps)
fig_patchwork8_height = 2 * fig_standard4_height
# 4x3 grid of twelve figures (e.g. umaps)
fig_patchwork12_height = 3 * fig_standard4_height
# 4x4 grid of sixteen figures (e.g. umaps)
fig_patchwork16_height = 4 * fig_standard4_height
# Load renv and virtualenvs
# Activate the project-local renv library and the bundled reticulate virtualenv so
# that R and Python package versions are reproducible across machines.
renv::load(file.path(param$path_to_git,"env/basic"))
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr)
library(clustree)
library(ggraph)

Read data

Read gene annotation

Gene annotation including Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names, are loaded from a pre-prepared reference file or Ensembl.

## Read gene annotation
# We read gene annotation from file. 
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names. 
# ATTENTION: ONLY for human and mouse!!!

### Set reference
################################################################################
# Fill in species-specific defaults for all parameters the user did not set.
# Plain 'if (is.null(...))' guards are used deliberately: 'ifelse()' is vectorized
# and would truncate multi-element user values, and an 'if' without 'else' (as in
# 'x = if (is.null(x)) {...}') would overwrite user-provided values with NULL,
# thereby deleting them from the list.
# ATTENTION: defaults are ONLY provided for human and mouse!
if (param$species == "human") {
  if (is.null(param$mart_dataset)) param$mart_dataset = "hsapiens_gene_ensembl"
  if (is.null(param$mt)) param$mt = "^MT-"
  if (is.null(param$enrichr_dbs)) {
    param$enrichr_dbs = c("GO_Biological_Process_2023", "WikiPathway_2023_Human", "Azimuth_2023", "CellMarker_2024")
  }
  if (is.null(param$annotation_dbs)) param$annotation_dbs = "BlueprintEncodeData()"
} else if (param$species == "mouse") {
  if (is.null(param$mart_dataset)) param$mart_dataset = "mmusculus_gene_ensembl"
  if (is.null(param$mt)) param$mt = "^mt-"
  if (is.null(param$enrichr_dbs)) {
    param$enrichr_dbs = c("GO_Biological_Process_2023", "WikiPathways_2019_Mouse", "Azimuth_2023", "CellMarker_2024")
  }
  if (is.null(param$annotation_dbs)) param$annotation_dbs = "MouseRNAseqData()"
}
# For any other species, param$mart_dataset (and friends) must be provided by the user.

# Set defaults
# Default is Ensembl release 98 which corresponds to 2020-A reference package of 10x Genomics Cell Ranger
# 'if (is.null(...))' guards are used instead of 'x = if (is.null(x)) {...}':
# the latter has no 'else' branch and therefore erased user-provided values
# (assigning NULL removes a list element).
if (is.null(param$annot_version)) param$annot_version = 98
if (is.null(param$annot_main)) {
  param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
}
if (is.null(param$mart_attributes)) {
  param$mart_attributes = c(c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession"), 
                            c("chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
}

# Paths to the pre-prepared annotation reference files
param$path_reference = file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference = paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
if (is.null(param$file_annot)) param$file_annot = file.path(param$path_reference, param$reference)
if (is.null(param$file_cc_genes)) param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")



### Download reference if not existing
################################################################################
# '||' (scalar, short-circuiting) instead of '|' since both operands are length-1 logicals
if (!file.exists(param$file_annot) || !file.exists(param$file_cc_genes)) {
  # Reset paths to the default location before triggering the download
  param$file_annot = file.path(param$path_reference, param$reference)
  param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")
  source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}


### read_ensembl_annotation
################################################################################

# Load the gene annotation table from file
annot_ensembl = read.delim(param$file_annot)

# Double-check that the annotation table provides every required column
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (isFALSE(check_annot_main)) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Build dictionaries to translate between Ensembl IDs, gene symbols, Entrez IDs,
# and Seurat-compatible rownames
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main))
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]


### read_cc_genes
################################################################################
# Cell cycle marker genes, translated from human to the species of interest via biomart

# Load S-phase and G2M-phase markers from the two sheets of the Excel file
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)

# Map Ensembl IDs to the Seurat-compatible unique rownames
to_seurat_rowname = function(ids) unname(ensembl_to_seurat_rowname[ids])
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=to_seurat_rowname(genes_s$Species_ensembl_id))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=to_seurat_rowname(genes_g2m$Species_ensembl_id))

Read scRNA-seq data

The workflow is applicable to single-cell and single-nuclei RNA-seq data pre-processed via 10x Genomics or SmartSeq-2, or to other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, an existing Seurat object can be loaded to inspect the stored scRNA-seq data.

Pre-processing of 10x data with Cell Ranger We use the 10x Genomics Cell Ranger software to map 10x sequencing reads. The result is a count matrix that contains the number of unique transcripts per gene (rows) and cell (columns). To save storage space, the matrix is converted into a condensed format and described by the following 3 files:
  • “features.tsv.gz”: Tabular file with gene annotation (Ensembl gene ID and the gene symbol) to identify genes
  • “barcodes.tsv.gz”: File with cell barcodes to identify cells
  • “matrix.mtx.gz”: A condensed version of the count matrix
Pre-processing of SmartSeq-2 data Sequencing reads from SmartSeq-2 (and other) experiments can be mapped with any suitable mapping software as long as a count matrix is generated that contains the number of mapped reads per gene (rows) and cell (columns). The first row of the count matrix should contain cell barcodes or names, the first column of the matrix should contain Ensembl gene IDs.
What is Seurat and which information is contained in a Seurat object? Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell. Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (layer=‘counts’), normalized data (layer=‘data’), or z-scored/variance-stabilized (scaled) data (layer=‘scale.data’).
In addition, the workflow stores additional information in the Seurat object, including but not limited to dimensionality reduction and cluster results.

Here, for the project Testdata, the following data are analysed:

### Read rds object
################################################################################

### Load Seurat S4 objects 
# Defaults, initialized up-front so the checks further down do not fail with
# 'object not found' when the loaded object carries no stored parameters:
# - rerun_read_gene_annotation is TRUE when the loaded annotation is concordant
#   with the current settings (i.e. no rerun is needed)
# - orig_param holds the parameter list stored in the object, if any
rerun_read_gene_annotation = TRUE
orig_param = list()

# Test if file is defined
if (is.null(param$data)) {
  message("Dataset is not specified")
} else {
  # Test if file exists
  if (file.exists(param$data)) {
    # Read object
    message(paste0("Load dataset: ", param$data))
    sc = base::readRDS(param$data)
    
    # Transfer original params stored in the loaded object
    if ("parameters" %in% list_names(sc@misc[])) {
      # Retrieve previous parameter settings
      orig_param = sc@misc$parameters
      # Derive the normalization method from the assay layers present in the object
      if ("SCT" %in% names(sc@assays)) {
        if ("scale.data" %in% Layers(sc[["SCT"]])) {
          orig_param$norm = "SCT"
        } else {
          orig_param$norm = "RNA"
        }
      } else {
        orig_param$norm = "RNA"
      }
      
      # Keep some parameter settings from object and project defined
      orig_param_keep = orig_param[c("annot_version", "species")]
      basic_param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
      # Test for reference concordance (TRUE when species and annotation version agree)
      rerun_read_gene_annotation = (orig_param$species == param$species & orig_param$annot_version == param$annot_version)
      # Integrate parameters; later modifyList() calls take precedence
      param = modifyList(x = param, val = orig_param)
      param = modifyList(x = param, val = basic_param_keep)
      param = modifyList(x = param, val = param_advset)
      param = modifyList(x = param, val = orig_param_keep)
    }
    
    # Rerun read_gene_annotation.R if the current annotation does not fit the loaded object
    if (!rerun_read_gene_annotation == TRUE) {
      source(file.path(param$path_to_git,'scripts/read_data/read_gene_annotation.R'))
    }
    
    # Confirm correct setting of existing normalization method
    if (param$norm %in% names(sc@assays)) {
      if (param$norm == "RNA") {
        # NOTE(review): this switches norm to "SCT" when the RNA assay has no
        # scale.data layer, even if no SCT assay exists - confirm this is intended
        if ("scale.data" %in% Layers(sc[["RNA"]])) {
            param$norm = "RNA"
        } else {
          param$norm = "SCT"
        }
      } else if (param$norm == "SCT") {
        if ("scale.data" %in% Layers(sc[["SCT"]])) {
          param$norm = "SCT"
        } else {
          param$norm = "RNA"
        }
      } else if (!is.null(orig_param$norm)) {
        # Custom assay name: fall back to the normalization stored with the object (if any)
        param$norm = orig_param$norm
      }
    } 

    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(sc@meta.data[["orig.ident"]])) {
      if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples)) 
        sc = sample_colours_out[[1]]
        param$col_samples = sample_colours_out[[2]]
      }
    }
 
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(sc@meta.data[["seurat_clusters"]])) {
      if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters)) 
        sc = cluster_colours_out[[1]]
        param$col_clusters = cluster_colours_out[[2]]
      }
    }

    # Set colors for cell types based on annotation 
    if (!is.null(sc@meta.data[["annotation"]])) {
      if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation)) 
        sc = annotation_colours_out[[1]]
        param$col_annotation = annotation_colours_out[[2]]
      }
    }

  # Return the loaded object (auto-printed when knitted)
  sc
  } else {
  message("Dataset does not exist")
  }
}


### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
  # Test if file exists
  if (file.exists(param$refdata)) {
    # Read object
    message(paste0("Load dataset: ", param$refdata)) 
    scR = base::readRDS(param$refdata)
    
    # Transfer colour settings stored with the reference object, if present
    if ("parameters" %in% list_names(scR@misc[])) {
      orig_paramR = scR@misc$parameters
      
      if (!is.null(orig_paramR$col_samples)) {
        param$col_samples_ref = orig_paramR$col_samples
      }
      if (!is.null(orig_paramR$col_clusters)) {
        param$col_clusters_ref = orig_paramR$col_clusters
      }
      if (!is.null(orig_paramR$col_annotation)) {
        param$col_annotation_ref = orig_paramR$col_annotation
      }
      # Advanced settings always take precedence
      param = modifyList(x = param, val = param_advset)
    }
    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(scR@meta.data[["orig.ident"]])) {
      if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples)) 
        scR = sample_colours_out[[1]]
        param$col_samples_ref = sample_colours_out[[2]]
      }
    }
    
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(scR@meta.data[["seurat_clusters"]])) {
      if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters)) 
        scR = cluster_colours_out[[1]]
        param$col_clusters_ref = cluster_colours_out[[2]]
      }
    }
    
    # Set colors for cell types based on annotation 
    if (!is.null(scR@meta.data[["annotation"]])) {
      if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation)) 
        scR = annotation_colours_out[[1]]
        param$col_annotation_ref = annotation_colours_out[[2]]
      }
    }
    
  # Return the loaded reference object (auto-printed when knitted)
  scR
  } else {
  message("Reference dataset does not exist")
  }
}  
# Transfer the single Seurat object into a named list, keyed by the project id
sc_original = sc
n = param$project_id
sc = setNames(list(sc_original), n)
# Download test dataset 
# Builds the path to the requested download script under scripts/download_test_datasets
# and runs it if present; otherwise only reports that the dataset does not exist.
param$path_test_dataset=paste0(param$path_to_git, "/scripts/download_test_datasets/", param$download_test_datasets, ".R")
if (file.exists(param$path_test_dataset)) {
  message(paste0("Using test dataset '", gsub('download_','', param$download_test_datasets), "'."))
  # Data output in a data subfolder of the directory where it is run 
  # Create output directories
  # NOTE(review): "data" is created relative to the *current* working directory,
  # while the subsequent setwd() switches to <git>/data - confirm whether
  # dir.create() should target file.path(param$path_to_git, "data") instead
  if (!file.exists("data")) dir.create("data", recursive=TRUE, showWarnings=FALSE)
  # NOTE(review): setwd() is not restored afterwards (no on.exit()); all later
  # relative paths run from <git>/data - confirm this is intended
  setwd(file.path(param$path_to_git,"data"))
  source(param$path_test_dataset)
  # Register the downloaded 10x count folders as input datasets (one row per folder)
  param$path_data = data.frame(name=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
                                              full.names = FALSE, recursive = FALSE), 
                                type=c("10x"),
                                path=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
                                              full.names = TRUE, recursive = FALSE))
} else {
  message("Test dataset does not exist.")
}
# List of Seurat objects
sc = list()

datasets = param$path_data
# seq_len() instead of seq(): the safe idiom when the data frame may have zero rows
for (i in seq_len(nrow(datasets))) {
  name = datasets[i, "name"]
  type = datasets[i, "type"]
  path = datasets[i, "path"]
  # 'suffix' is optional; subsetting a missing column would raise
  # "undefined columns selected" (e.g. for the auto-generated test dataset table)
  suffix = if ("suffix" %in% colnames(datasets)) datasets[i, "suffix"] else NULL
  
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
    
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  } 
}

# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
  dataset_prefix = head(sc[[i]][["orig.dataset", drop=TRUE]], 1)
  sample_names[i] = paste(dataset_prefix, sample_names[i], sep=".")
  sc[[i]][["orig.ident"]] = sample_names[i]
}

# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(obj) {
  unique(as.character(obj[[]][["orig.ident"]]))
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

# Report success and show the list of per-sample Seurat objects (auto-printed when knitted)
message("Read scRNA-seq data into Seurat object")
sc
## $Sample1
## An object of class Seurat 
## 8000 features across 500 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts
## 
## $Sample2
## An object of class Seurat 
## 8000 features across 700 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts
### Calculate percentages of mitochondrial and ribosomal genes as well as ERCC (qc_calculate_cells)
# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, function(s) {
  # Skip objects that already carry the metric
  if ("percent_mt" %in% colnames(s@meta.data)) return(s)
  mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
  Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw)
})

# Calculate percentage of counts in ribosomal genes for each Seurat object
sc = purrr::map(sc, function(s) {
  if ("percent_ribo" %in% colnames(s@meta.data)) return(s)
  ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
  Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw)
})

# Calculate percentage of counts in ERCC spike-ins for each Seurat object (if the assay is available)
sc = purrr::map(sc, function(s) {
  if ("ERCC" %in% Seurat::Assays(s)) {
    s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
  }
  s
})

# Combine the cell metadata of all Seurat objects into one big metadata table
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())

# Names of all available QC metrics; percent_ercc only when spike-ins were measured
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) {
  cell_qc_features = c(cell_qc_features, "percent_ercc")
}
cell_qc_features = values_to_names(cell_qc_features)




### Expand filter thresholds to all samples (qc_criteria_create)
# If filters were specified globally (i.e. not by sample), copy them for each
# sample so downstream filtering can always operate per sample
expand_filter_per_sample = function(filter_set) {
  purrr::map(list_names(sc), function(s) {
    if (s %in% names(filter_set)) filter_set[[s]] else filter_set
  })
}
param$cell_filter = expand_filter_per_sample(param$cell_filter)
param$feature_filter = expand_filter_per_sample(param$feature_filter)

# List of all filter thresholds per QC metric, as long tables (orig.ident / threshold / value)
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
  purrr::map_dfr(names(param$cell_filter), function(n) {
    # Samples without a filter for this metric contribute nothing (NULL rows are dropped)
    if (!(m %in% names(param$cell_filter[[n]]))) return(NULL)
    threshold_tbl = data.frame(orig.ident=n,
                               min=param$cell_filter[[n]][[m]][1],
                               max=param$cell_filter[[n]][[m]][2]) %>%
      tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
      dplyr::filter(!is.na(value))
    threshold_tbl$threshold = factor(threshold_tbl$threshold, levels=c("min", "max"))
    threshold_tbl
  })
})



### Calculate diverse feature characteristics (qc_calculate_features)
# Only RNA assay at the moment
# counts_median uses apply on the counts matrix, which converts the sparse matrix into a normal matrix
#   This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
  # Calculate percentage of counts per gene within each cell (columns sum to 100)
  counts_rna = Seurat::GetAssayData(sc[[n]], layer="counts", assay=param$assay_raw)
  total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
  # Double transpose divides each column (cell) by its total while keeping the sparse layout
  counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
  
  # Calculate feature filters: in how many cells a gene is detected at all,
  # and in how many cells it exceeds the per-sample min_counts threshold
  num_cells_expr = Matrix::rowSums(counts_rna >= 1)
  num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
  
  # Calculate median of counts_rna_perc per gene (densifies the sparse matrix, see note above)
  counts_median = apply(counts_rna_perc, 1, median)
  
  # Add all QC measures as feature-level metadata of the raw assay
  sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
  return(sc[[n]])
})


Pre-processing

The steps below represent a standard processing workflow for single-cell RNA-seq data, including quality control, the respective filtering of cells and genes, data normalization and scaling, the detection of highly variable genes, and dimensional reduction.
Why is pre-processing so important? Cells may have been damaged during sample preparation and might be only partially captured in the sequencing. In addition, free-floating mRNA from damaged cells can be encapsulated, adding to the background noise. The low-quality cells and free-floating mRNA interfere with downstream analyses. Therefore, cells and genes are filtered based on defined quality metrics. Data normalization eliminates cell-specific biases such as the absolute number of reads per cell, allowing us to systematically compare cells afterwards. Subsequent scaling corrects for the fact that genes have different lengths, allowing us to compare genes afterwards. Lastly, highly variable genes are determined, reducing computational overhead and noise from genes that are not interesting.


Quality control

Quality control (QC) is an important step of the pre-processing workflow. We start the analysis by removing unwanted cells from the dataset(s).
These commonly used QC metrics, namely the number of unique genes detected in each cell (“nFeature_”), the total number of molecules detected in each cell (“nCount_”), and the percentage of counts that map to the mitochondrial genome (“percent_mt”), can be used to infer filter thresholds. If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).

Which cells can be considered to be of low quality?

In cell QC these covariates are filtered via thresholding as they are a good indicator of cell viability. However, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals.

  • Cells with very high number of genes should possibly be excluded as those might be duplicates. Since the total number of reads detected within a cell typically strongly correlates with the number of unique genes, we can focus on the number of unique genes for filtering.
    • Cells with very low count depth might be droplets with ambient RNA. If the dataset contains a high number of such events, it is advisable to perform additionally upstream correction for ambient RNA e.g. via SoupX (Young and Behjati (2020)) as ambient RNA can confound the number of observed counts and adds background noise to the data.
    • Low-quality cells such as dying, degraded and damaged cells will also often have very few genes (“nFeature_”) and low total counts (“nCount_”). In addition, damaged cells often exhibit high mitochondrial (“percent_mt”) or spike-in (“percent_ercc”) content. As mitochondria have their own membranes, their RNA is often the last to degrade in damaged cells and can thus be found in high numbers. However, it is important to keep in mind that different cell types and cells isolated from various species may differ in their number of expressed genes and metabolism. For example, stem cells may express a higher number of unique genes, and metabolically active cells may express a higher number of mitochondrial genes. Hence, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals. E.g. a cell with a high fraction of mitochondrial counts might be a dying cell with a broken membrane in combination with a low number of counts, but in combination with a intermediate to high count depth it might be a cell involved in respiratory processes.
Impact of low-quality cells on downstream analyses First of all, low-quality cells of different cell types might cluster together due to similarities in their damage-induced gene expression profiles. This leads to artificial intermediate states or trajectories between subpopulations that would otherwise be distinct. Second, low-quality cells might mask relevant gene expression changes. The main differences captured by the first principal components will likely be based on cell quality rather than the underlying biology, reducing the power of dimensionality reduction. Likewise, highly variable genes might just be genes that differ between low- and high-quality cells. Lastly and equally important, the low number of total counts in low-quality cells might lead to artificial upregulation of genes due to wrong scaling effects during the normalisation process.


Filtering

Cells and genes are filtered based on the following thresholds:

# Flatten the per-sample cell filters into one list and classify each entry
cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) {
  is.numeric(f) & length(f)==2
})

# numeric cell filters
numeric_filters = cell_filter_lst[is_numeric_filter]
if (length(numeric_filters) > 0) {
  purrr::exec("rbind", !!!numeric_filters) %>%
    knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}

# categorial cell filters
categorial_filters = cell_filter_lst[!is_numeric_filter]
if (length(categorial_filters) > 0) {
  purrr::exec("rbind", !!!categorial_filters %>% purrr::map(paste, collapse=",")) %>%
    knitr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
  
# gene filters
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
  purrr::exec("rbind", !!!feature_filter_lst) %>% 
    knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
Numeric filters applied to cells
Min Max
Sample1.nFeature_RNA 20 NA
Sample1.nCount_RNA 200 20000
Sample1.percent_mt 0 18
Sample2.nFeature_RNA 20 NA
Sample2.nCount_RNA 200 20000
Sample2.percent_mt 0 18
Filters applied to genes
Value
Sample1.min_counts 1
Sample1.min_cells 5
Sample2.min_counts 1
Sample2.min_cells 5

The number of excluded cells and features is as follows:

# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
# Result: per sample, a named list of character vectors of cell barcodes — one
# entry per configured filter plus "samples_to_drop" and "samples_min_cells"
sc_cells_to_exclude  = purrr::map(list_names(sc), function(n) { 
  filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
    # Filter definition: numeric c(min, max) pair or character vector of allowed values
    filter = param$cell_filter[[n]][[f]]
    if (is.numeric(filter)) {
      # An NA bound means "unbounded" on that side
      if (is.na(filter[1])) filter[1] = -Inf # Minimum
      if (is.na(filter[2])) filter[2] = Inf  # Maximum 
      # sc[[n]][[f, drop=TRUE]] fetches cell metadata column f as a (named) vector
      idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
      return(names(which(idx_exclude)))
    } else if (is.character(filter)) { 
      # Categorical filter: exclude cells whose value is not among the allowed ones
      idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
      return(Cells(sc[[n]])[idx_exclude])
    }
    # Filters that are neither numeric nor character fall through and yield NULL
  })

  # Samples to drop: exclude every cell of samples listed in param$samples_to_drop
  if (n %in% param$samples_to_drop) {
    filter_result[["samples_to_drop"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_to_drop"]] = as.character(c())
  }
  
  # Minimum number of cells for a sample to keep: exclude every cell of too-small samples
  if (ncol(sc[[n]]) < param$samples_min_cells) {
    filter_result[["samples_min_cells"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_min_cells"]] = as.character(c())
  }
  
  return(filter_result)
})

# Summarise: one row per sample, one column per filter holding the number of
# cells failing that filter (a cell can fail several filters at once)
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
  return(as.data.frame(purrr::map(s, length))) 
  })
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)

# Totals: Original = cells before filtering, Excluded = unique cells failing at
# least one filter, PercKept = percentage of cells remaining after filtering
sc_cells_to_exclude_summary$Original = purrr::map_int(sc, ncol)
sc_cells_to_exclude_summary$Excluded = purrr::map_int(sc_cells_to_exclude, function(s) { return(purrr::flatten(s) %>% unique() %>% length())})
sc_cells_to_exclude_summary$PercKept = round((sc_cells_to_exclude_summary$Original - sc_cells_to_exclude_summary$Excluded) / sc_cells_to_exclude_summary$Original * 100, 2)
knitr::kable(sc_cells_to_exclude_summary, 
             align="l", 
             caption="Summary of excluded cells") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))

# Add filter column to sc_cell_metadata for post-filtering QC
# NOTE(review): the %in% lookup assumes cell barcodes are unique across samples — confirm
sc_cell_metadata$IS_FILTERED = rownames(sc_cell_metadata) %in% unlist(sc_cells_to_exclude)

# Apply the cell filters: subset each object to its retained cells and drop
# objects that end up empty
sc = purrr::map(list_names(sc), function(n) {
  excluded = purrr::flatten_chr(sc_cells_to_exclude[[n]])
  keep = Cells(sc[[n]])
  keep = keep[!keep %in% excluded]
  if (length(keep) > 0) {
    return(subset(sc[[n]], cells=keep))
  }
  return(NULL)
}) %>% purrr::discard(is.null)

# With a single remaining sample there is nothing to integrate
if (length(sc)==1) param$integrate_samples[["method"]] = "single"
Summary of excluded cells
nFeature_RNA nCount_RNA percent_mt samples_to_drop samples_min_cells Original Excluded PercKept
Sample1 9 2 53 0 0 500 55 89.00
Sample2 22 3 65 0 0 700 67 90.43
# Only RNA assay at the moment

# Iterate over samples and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
  
  # Make sure the sample contains more cells than the minimum threshold
  # NOTE(review): returning an empty list here makes the "AllSamples"
  # intersection below empty, i.e. NO genes are removed at all in that case —
  # confirm this is the intended behaviour
  if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
  
  # Return gene names that do not pass the minimum threshold 
  # "num_cells_expr_threshold" is a per-feature metadata column — presumably the
  # number of cells expressing the gene above min_counts, computed earlier — TODO confirm
  else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})

# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
# A gene is only removed if it fails the threshold in every sample
sc_features_to_exclude$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)

# Summarise per sample: how many genes fail the threshold, and how many are
# kept after removing the genes that fail in all samples
sc_features_to_exclude_summary = purrr::map_dfr(names(sc), function(n){
  n_orig = nrow(sc[[n]])
  n_fail = length(sc_features_to_exclude[[n]])
  n_kept = length(setdiff(rownames(sc[[n]]), sc_features_to_exclude[["AllSamples"]]))
  data.frame(Original=n_orig,
             FailThreshold=n_fail,
             PercFailThreshold=round(n_fail / n_orig * 100, 2),
             Kept=n_kept,
             PercKept=round(n_kept / n_orig * 100, 2))
})
rownames(sc_features_to_exclude_summary) = names(sc)
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Summary of excluded genes") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))

# Remove the genes that failed the threshold in every sample, across all assays
sc = purrr::map(list_names(sc), function(n) {
  per_assay_keep = purrr::map(values_to_names(Seurat::Assays(sc[[n]])), function(a) {
    all_features = rownames(sc[[n]][[a]])
    all_features[!all_features %in% sc_features_to_exclude$AllSamples]
  })
  subset(sc[[n]], features=purrr::flatten_chr(per_assay_keep))
})
Summary of excluded genes
Original FailThreshold PercFailThreshold Kept PercKept
Sample1 8000 1915 23.94 6439 80.49
Sample2 8000 1686 21.08 6439 80.49

After filtering, the size of the Seurat object is:

sc
## $Sample1
## An object of class Seurat 
## 6439 features across 445 samples within 1 assay 
## Active assay: RNA (6439 features, 0 variable features)
##  1 layer present: counts
## 
## $Sample2
## An object of class Seurat 
## 6439 features across 633 samples within 1 assay 
## Active assay: RNA (6439 features, 0 variable features)
##  1 layer present: counts


QC covariates

# Create plot per QC metric: one violin per sample, filtered cells excluded
p_list = list()
for (m in cell_qc_features) {
  p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]], group=.data[["orig.ident"]])) +
    geom_violin(scale="width")

  # Adds points for samples with less than three cells since geom_violin does not work here
  p_list[[m]] = p_list[[m]] + 
    geom_point(data=sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED, orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]]), shape=21, size=2)
  
  # Now add style
  p_list[[m]] = p_list[[m]] + 
    AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") + 
    theme(axis.text.x=element_text(angle=45, hjust=1))
  
  # Add filter thresholds as horizontal segments, one per sample
  # NOTE(review): the linetype mapping below assigns "dashed" to max and "F1" to
  # min thresholds — the opposite of what the original comment here claimed
  if (nrow(cell_qc_thresholds[[m]]) > 0) {
    # Only draw thresholds for samples that still have unfiltered cells; the
    # segment x positions assume factor order matches the plotted order — TODO confirm
    sample_names = sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED) %>% dplyr::pull(orig.ident) %>% unique()
    p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]] %>% dplyr::filter(orig.ident %in% sample_names), 
                                             aes(x=as.integer(as.factor(orig.ident))-0.5, 
                                                 xend=as.integer(as.factor(orig.ident))+0.5, 
                                                 y=value, yend=value, lty=threshold), colour="firebrick") +
      scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
  }
}
# Arrange all metric plots in a two-column grid
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values") 
p

# Correlate QC metrics for cells
# Cells are drawn in random order so that no sample systematically covers another
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))

# Helper: scatter plot of QC metric m[1] (x-axis) vs m[2] (y-axis), coloured by
# sample; filtered cells are de-emphasised via alpha. Filter thresholds (if any)
# are added as red lines ("dashed" for max, "F1" for min thresholds).
# The three plots below previously duplicated this ~15-line section verbatim.
PlotQcScatter = function(m) {
  p = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]], alpha=.data[["IS_FILTERED"]])) +
    geom_point(size = param$pt_size) + 
    scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
    scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
    AddStyle(col=param$col_samples)
  if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
    p = p + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
  }
  if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
    p = p + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
  }
  return(p)
}

# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = PlotQcScatter(m)

# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = PlotQcScatter(m)

# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
  m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
  p_list[[3]] = PlotQcScatter(m)
}

# Combine plots and collect the legends below the panels
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list) == 1) {
  p = p & theme(legend.position="bottom")
} else {
  p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p

# Determine the down-sampling target: downsample_cells_n overwrites downsample_cells_equally
if (!is.null(param$downsample_cells_n)) {
  n = param$downsample_cells_n
} else if (param$downsample_cells_equally) {
  # Down-sample every sample to the size of the smallest one
  n = purrr::map_int(sc, ncol) %>% min()  
}

# Actual downsampling (scalar || since both conditions are length-one)
if (!is.null(param$downsample_cells_n) || param$downsample_cells_equally) {
  sc = purrr::map(sc, function(s) {
    cells = ScSampleCells(sc=s, n=n, seed=1)
    return(subset(s, cells=cells))
  })
  
  # Adjust combined metadata accordingly
  sc_cell_metadata = sc_cell_metadata[unlist(purrr::map(sc, Cells)), ]
  
  # Bug fix: report the actual target size n — param$downsample_cells_n is NULL
  # when down-sampling equally, which previously produced a message without a number
  message(paste0("Your data has been down-sampled to ", n, " cells." ))
  print(sc)
}
# Drop cells flagged as filtered from the combined metadata table
sc_cell_metadata = dplyr::filter(sc_cell_metadata, !IS_FILTERED)

# Drop now-unused factor levels while keeping the original level order
factor_cols = names(sc_cell_metadata)[purrr::map_lgl(sc_cell_metadata, is.factor)]
for (col in factor_cols) {
  old_levels = levels(sc_cell_metadata[[col]])
  still_present = unique(as.character(sc_cell_metadata[[col]]))
  sc_cell_metadata[[col]] = factor(sc_cell_metadata[[col]], levels=old_levels[old_levels %in% still_present])
}

# Restrict the sample colour palette to the remaining samples, otherwise
# dropped samples would still appear in plot legends
param$col_samples = param$col_samples[names(param$col_samples) %in% names(sc)]

In the following sections, we subsequently run a series of Seurat functions for each provided sample. We normalize, select variable genes, and combine the datasets according to the chosen methods. In the process, cell cycle scores are assigned to each cell based on its normalized expression of G2/M and S phase markers, and data are scaled while regressing out covariates if desired.


Normalisation

Dependent on the normalisation of your choice, we either:

  1. We perform standard log normalisation by running standard functions to select variable genes, and scale normalised gene counts.
  2. Run SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling).
What do we need normalisation for?

The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed by filtering, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal. Hence, normalization of the raw counts accounts for differences in sequencing depth per cell.

The standard log normalisation, a count depth scaling, is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor”, which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and is then typically multiplied by a factor of 10 (10,000 in this workflow). Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces that skewness of the data as many downstream analyses require the data to be normally distributed.
What do we need scaling for? To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score).
After normalization, gene expression data can be compared between cells. However, expression of individual genes still cannot be compared. Highly expressed genes can overpower the signal of other less expressed genes with equal importance (overdispersed count values). Moreover, in SmartSeq-2 data, due to differing gene lengths, longer genes can be represented by a higher number of reads. To account for this effect, normalized data are further scaled using a z-transformation, resulting in an average expression of 0 and a variance of 1 for each gene across all cells. Note that additional unwanted sources of variation can be regressed out during the scaling process, such as cell cycle effects or the percentage of mitochondrial reads.
What is SCTransform special about? The standard log normalisation assumes that count depth influences all genes equally. However, it has been shown that the use of a single size factor will introduce different effects on highly and lowly expressed genes (Hafemeister and Satija (2019)). SCTransform is a new statistical approach for the modelling, normalization and variance stabilization of single-cell RNA-seq data. SCTransform models the UMI counts (via a regularized negative binomial model) to remove the variation due to sequencing depth and adjusts the variance based on pooling information across genes with similar abundances and automatically regresses out sequencing depth (nCounts). Hence, SCTransform can be applied to 10x (contains UMI counts) but not SmartSeq-2 data. Additional unwanted sources of variations can be regressed out during SCTransform.

While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.

# Normalize data the original way
#   This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
# NOTE(review): sc is indexed by param$project_id here, while the surrounding
# code treats sc as a list keyed by sample name — confirm this is intended
if (!("data" %in% sc[[param$project_id]][["RNA"]][])) {
  source(file.path(param$path_to_git,'scripts/pre-processing/normalization.R'))
}
# Report which normalisation method was applied; no covariates regressed out at this stage
message(paste0("Here, the ", ifelse(param$norm=="RNA","Standard log normalization","SCTransform")," method was used and no additional sources of variance regressed out."))

× (Message)
Here, the Standard log normalization method was used and no additional sources of variance regressed out.


Variable genes

The top 3,000 variable genes are selected.
Why selecting variable genes? For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly in others.
Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single-cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells. Highly variable genes are typically the genes with a cell type specific expression profile, and are often the genes of interest in single-cell experiments. Housekeeping genes, with similar levels of expression across all cells, or genes with minor expression differences, might add random noise and mask relevant changes during downstream dimensionality reduction and clustering. We therefore aim to select a sensible set of variable genes that includes interesting biological signal and excludes noise. Here, the top 3,000 variable genes are selected and used for downstream analysis.
How are variable genes selected? To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in.
standard log normalisation:
Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019)). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability.
SCTransform:
Also SCTransform returns a list of top 3,000 variable genes.


Combine datasets

Whether samples should be combined via merging or integrating is very context dependent. It is advisable to first perform a dimensionality reduction on the merged dataset and only proceed to integration if common cell types separate due to batch effects.

Why integration? If the cells cluster by covariates, such as donors, batches, or conditions, integrative analysis can help to remove biological differences, match shared cell types and states across datasets, and improve clustering and downstream analyses.
The goal of integration is to find corresponding cell states across samples. Vice versa, integration relies on the presence of at least a subset of corresponding cell states that are shared across datasets. However, given the increased degrees of freedom of non-linear data integration approaches, this approach might lead to over-correction of the data, especially when a large proportion of cells is non-overlapping across datasets, and a loss of biological resolution.
How does integration work? Integration uses the shared highly variable genes from each condition and then aligns the conditions to overlay cells that are similar between samples. The integrated values are a non-linear transformation of scale.data and saved in an additional layer. Moreover, the Seurat v5 integration procedure returns a dimensional reduction (i.e. integrated.cca) of the data that captures the shared sources of variance across multiple layers and can be used for visualization and unsupervised clustering. The harmonization of LogNormalized and SCT expression values across datasets is overall similar (see https://satijalab.org/seurat/articles/seurat5_integration and https://satijalab.org/seurat/articles/integration_introduction.html).
The workflow in detail The workflows for the different normalization and sample combination methods are as follows:
  1. LogNormalize data
    1a) LogNormalize - (CC scoring) - Merge - (Rerun CC scoring) - Scale - PCA
    1b) LogNormalize - (CC scoring) - Scale - PCA - Integrate - (Rerun CC scoring) - Scale - PCA
  2. SCTransform data
    with homogeneous experimental groups
    2a) (LogNormalize - CC scoring - ) SCTransform - Merge - (Rerun CC scoring) - PCA
    2b) (LogNormalize - CC scoring - ) SCTransform - PCA - Integrate - (Rerun CC scoring) - PCA
    with heterogeneous experimental groups
    2c) (LogNormalize - CC scoring - ) SCTransform - Merge - (Rerun CC scoring) - Rerun SCTransform - PCA
    2d) (LogNormalize - CC scoring - ) SCTransform - PCA - Integrate - (Rerun CC scoring) - Rerun SCTransform - PCA

Merging LogNormalized data (1a)
LogNormalized data are merged according to https://satijalab.org/seurat/archive/v4.3/merge. Merging will combine the Seurat objects based on the raw count matrices and also merge the normalized data matrices. Any previously scaled data matrices are erased. As the results of this layer depend on the expression across all cells in an object, the previous values from one object before merging with another are no longer valid. Therefore, the layer is removed and the scaling needs to be re-run after merging objects.

Merging SCTransform data (2a, 2c)
The goal of SCTransform is to perform normalization within an experiment by learning the model of technical noise (within the experiment). Running SCTransform standardizes the expression of each gene in each cell relative to all the other cells in the input matrix.
2a) If you have multiple samples that are technically noisy and you want to remove batch effects that are characterized by simple shifts in mean expression, it is recommended to run SCTransform on each sample individually. However, the samples should have roughly the same cell type compositions. The merged object then has multiple SCT models, and the genes in scale.data are the intersected genes.
2c) However, scale.data calculated based on multiple different SCTransform models are not comparable. Performing SCTransform separately on very diverging samples results in loss of the baseline differences between them (similar to a gene-wise scaling before merging). Hence, if samples are biologically heterogeneous or under different treatments and, therefore, the SCTransform models of each separate sample very diverging, SCTranform should be (re-)run on merged samples. This way, SCTransform will learn one model using the median sequencing depth of the entire dataset to calculate the corrected counts.

Integration (1b, 2b, 2d)
As integration uses the shared highly variable genes, prior to integration LogNormalize and identification of variable genes or SCTransform is required. Depending on the integration method, a dimensional reduction of the data is needed additionally (e.g. for reciprocal PCA).

In most cases, a re-scoring of the cell cycle state is recommended or at least not disadvantageous.

This workflow implements the following integration methods offered by Seurat:
- Anchor-based CCA integration (method=CCAIntegration)
- Anchor-based RPCA integration (method=RPCAIntegration)

Anchor-based integration can be applied to either log-normalized or SCTransform-normalized datasets and can be also performed as reference-based integration where one or a subset of the datasets are listed as a ‘reference’. Reference-based integration, reduces computational time.

Canonical Correlation Analysis (CCA)

CCA-based integration enables identification of conserved cell types across datasets and integrative analysis when gene expression is shifted, e.g. due to experimental conditions, disease states, or even species affiliation. If cell types are present in only one dataset, the cells will appear as a separate sample-specific cluster.

Integration comprises the following steps:
1. CCA, a form of PCA, identifies shared sources of variation (using the 3000 most variant genes from each sample) between the samples.
2. For each cell in one sample, mutual nearest neighbors (MNNs) based on gene expression values are identified in the other sample(s). If the same match results from the reciprocal analysis, the cell pair forms an anchor.
3. The similarity between anchor pairs, i.e. whether the adjacent cells also form corresponding anchor pairs, is assessed and incorrect anchors are removed.
4. The datasets are integrated by transforming the cell expression values using anchors and corresponding scores.
Reciprocal PCA (RPCA) For RPCA, instead of identifying shared sources of variation using variant genes, each dataset is projected into the other’s PCA space. Afterwards, similarly to CCA, mutual neighbors and anchors are identified.
RPCA-based integration is significantly faster and more conservative, i.e. cells in different biological states are less likely to align. Hence, RPCA is suitable for the integration of multiple datasets, of datasets with little overlap of cell types, or datasets originating from the same platform. The strength of alignment can be elevated by increasing the k.anchor parameter (default 5).
# Combine samples: only merge/integrate when there is more than one sample
if (length(sc) > 1) {
  source(file.path(param$path_to_git,'scripts/pre-processing/combine_samples.R'))
} else {
  # Default assay is set automatically
  sc = sc[[1]]
  message("Your dataset contains 1 sample only. No merging/integrating required.")
}

× (Message)
Data values for all samples have been merged. This means that data values have been concatenated, not integrated.

# Only relevant if data loaded from rds, otherwise they would be scaled and dimensional reduced before
# sc[["RNA"]][] presumably lists the assay's layer names (Seurat v5) — TODO confirm
if (!("scale.data" %in% sc[["RNA"]][])) {
  # Scale (default): z-score all genes so expression becomes comparable between genes
  all_genes = rownames(sc)
  sc = suppressMessages(ScaleData(sc, features = all_genes))
} 
# Run pca with the variable genes
if (!("pca" %in% list_names(sc@reductions[]))) {
  # Run PCA for default normalization; npcs is capped by the number of cells
  sc = suppressMessages(Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc))))
}


Scaling

Next, we estimate the impact of covariates, such as the percentage of mitochondrial gene expression and cell cycle phase, to evaluate the reliability of the applied normalization and scaling method, and to determine possible variables to regress out and the integration method.

What are unwanted sources of variations that could be regressed out?

There can be sources of uninteresting variation in the data that is often specific to the dataset. Hence, checking and removing unwanted variation is another important step in pre-processing so that those artifacts do not drive clustering in the downstream analysis.

Variables provided in vars.to.regress are individually regressed against each feature, and the resulting residuals are then scaled and centered. This step allows controlling for factors that may bias clustering.

Count depth effect
Although normalization scales count data to render gene counts comparable between cells, a count depth effect often remains in the data as no scaling method can infer the expression values of genes that were not detected. This count depth effect can be both a biological and a technical artefact (due to poor sampling).
SCTransform regresses out variation due to the number of UMIs by default.

Cell cycle score
Cell cycle phase may be a source of significant variation in the data. It is essential to check, how much the gene expression profiles in the data reflect the cell cycle phases the single cells were in.

Mitochondrial gene expression
mitochondrial gene expression is another factor which can greatly influence clustering. However, if the differences in mitochondrial gene expression represent a biological phenomenon that may help to distinguish cell clusters, then we advise not regressing the mitochondrial expression.
How does removal of cell cycle effects affect the data?

Note that removing all signal associated to cell cycle can negatively impact downstream analysis. Cell cycle signals can be informative of the biology. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. Moreover, biological signals must be understood in context. Dependencies exist between diffferent cellular processes within the organism. Thus, correcting for one process may unintentionally mask the signal of another. Vice versa, due to a correlation of cell size and some transcriptomic effect attributed to the cell cycle (McDavid et al, 2016), correcting for cell size via normalization, also partially corrects for cell cycle effects in scRNA‐seq data.

An alternative approach is to remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences in cell cycle phase amongst proliferating cells (which are often uninteresting), will be removed. For a more detailed explanation, see the cell cycle vignette from Seurat (https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html).
How are cycle effects regressed out? Prior to calling the function CellCycleScoring the data need to be standard log normalized, since counts need to be comparable between cells. Once the data is normalized for sequencing depth, a score can be calculated for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt.
If specified, cell cycle effects can be removed during scaling. In the process, for each gene, Seurat models the relationship between gene expression and the S and G2M cell cycle scores. The scaled residuals of this model represent a ‘corrected’ expression matrix.
# Report which covariates (if any) were regressed out and how cell cycle effects
# were handled.
# Bug fix: if/else is used instead of ifelse() because ifelse() with a scalar
# test returns only the FIRST element of the chosen branch — the previous code
# truncated the message when several variables were regressed out.
vars_msg = if (is.null(param$vars_to_regress)) {
  "no additional sources of variance regressed out. "
} else {
  paste0(paste(param$vars_to_regress, collapse=", "), " as sources of variance was/were regressed out. ")
}
cc_msg = if (param$cc_remove==FALSE) {
  "No cell cycle effects removed for this report."
} else if (param$cc_remove_all==TRUE) {
  "All cell cycle effects removed for this report."
} else {
  "Difference in cell cycle state removed for this report."
}
message(paste0("Here, ", vars_msg, cc_msg))

× (Message)
Here, no additional sources of variance regressed out. No cell cycle effects removed for this report.

sc
## An object of class Seurat 
## 6439 features across 1078 samples within 1 assay 
## Active assay: RNA (6439 features, 3000 variable features)
##  3 layers present: data, counts, scale.data
##  1 dimensional reduction calculated: pca


Dimensional reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.

We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.

PCA in a nutshell

Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.

In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.


Principal component analysis

Running a PCA on our object, we see how the variance can be explained.

# Visualise the top gene loadings of the first 12 principal components,
# one panel per PC, arranged in a 4-column grid.
p_list = Seurat::VizDimLoadings(sc, dims=1:12, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
# seq_along() is safe for an empty plot list, unlike seq() which would yield 1:0
for (i in seq_along(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(xlab = paste0("PC ",i))
# Fixed annotation: the panels show the first 12 PCs, not just the first two
p = patchwork::wrap_plots(p_list, ncol=4) + patchwork::plot_annotation("Top gene loadings of the first 12 PCs")
p


Dimensionality of the dataset

PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision.

How do we determine the number of PCs for downstream analysis? The Elbow plot shows PCs ranked based on the percentage of variance they explain. The top PC captures the greatest variance across cells and each subsequent PC represents decreasing levels of variance. By visual inspection of the Elbow plot, we try to find the point at which we can explain most of the variance across cells. Commonly, the top 10 PCs are chosen. It may be helpful to repeat downstream analyses with a different number of PCs, although the results often do not differ dramatically. Note that it is recommended to rather choose too many than too few PCs.
# Cannot have more PCs than number of cells; cap BEFORE plotting so the dashed
# marker line reflects the value actually used downstream (previously the cap
# happened after the plot, so the line could mark a PC beyond ncol(sc))
param$pc_n = min(param$pc_n, ncol(sc))

# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=min(30, ncol(sc))) + 
  geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) + 
  AddStyle(title="Elbow plot") 
p

message(paste0("For the current dataset ", param$pc_n," PCs were chosen."))

× (Message)
For the current dataset 8 PCs were chosen.


Batch effects

We visualize the sample distribution in low dimensional space to confirm that the chosen method for combining the samples was appropriate (see section Combine datasets). The following plots offer a low dimension representation of your data combined via dataset merging.

# Default UMAP
# Compute the 2D UMAP embedding from the first pc_n PCs (uwot backend,
# neighbourhood size param$umap_k); warnings from uwot/Seurat are suppressed
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))
### Score plots
# PCA score plots colored by sample: PCs 1/2 and PCs 3/4 side panels to check
# whether samples separate along any of the top components (batch effect check)
p1 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size=param$pt_size, dims = c(1,2)) + 
  AddStyle(legend_position="none", title = "", xlab = "PC 1", ylab = "PC 2")
p2 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size=param$pt_size, dims = c(3,4)) + 
  AddStyle(legend_position="none", title = "", xlab = "PC 3", ylab = "PC 4")
# Umap plot coloured by sample of origin (main panel)
p3 = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", cols = param$col_samples, pt.size=param$pt_size) +
  AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")

# Layout: large UMAP on the left, the two PCA panels stacked on the right
p = p3 + (p1 / p2) +
  plot_layout(widths = c(3, 1))
p

Clustering

The clustering method used by Seurat first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag et al. (2019)).

Further explanation on clustering

At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.

During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.

To get a first idea about how different cluster resolution values influence the clustering, we run and visualize the clustering multiple times.

# Resolutions for the clustering tree, plus any resolutions explicitly
# requested via the parameters; duplicates removed, ascending order
tree_resolutions = seq(from = 0, to = 1.0, by = 0.2)
requested_resolutions = c(param$cluster_resolution, param$cluster_resolution_test, tree_resolutions)
cluster_resolutions = sort(unique(requested_resolutions))


# Construct phylogenetic tree relating the "average" cell from each sample
# (only meaningful when more than one sample is present); stored in the
# object's Misc slot under "trees" for later retrieval.
# NOTE(review): BuildClusterTree groups by the current Idents -- assumes Idents
# are set to orig.ident at this point; confirm upstream.
if (length(levels(sc$orig.ident)) > 1) {
  sc = BuildClusterTree(sc, features=rownames(sc), verbose=FALSE)
  Seurat::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "BuildClusterTree"))
}

# Build the shared-nearest-neighbour graph and cluster it once per requested
# resolution; each resolution adds a metadata column "<assay>_snn_res.<r>".
# The number of clusters can be optimized by tuning "resolution" -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE, k.param=param$cluster_k)

# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# But we can run multiple resolutions if requested
# method: Method for running leiden (defaults to matrix which is fast for small datasets). Enable method = "igraph" to avoid casting large data to a dense matrix.
# algorithm: Algorithm for modularity optimization (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm; 4 = Leiden algorithm). Leiden requires the leidenalg python.
sc = Seurat::FindClusters(sc, algorithm=4, method="igraph", resolution=cluster_resolutions, verbose=FALSE)

# Construct phylogenetic tree relating the "average" cell from each cluster for each clustering
# Also add colour lists for each clustering
# For every clustering resolution: (1) build a cluster tree (if >1 cluster) and
# append it to Misc "trees"; (2) generate a named colour palette and store it
# in the object's "colour_lists" slot under the resolution's metadata column name.
for(r in cluster_resolutions) {
  # Metadata column name for this resolution, e.g. "RNA_snn_res.0.6"
  n = paste0(DefaultAssay(sc), "_snn_res.", r)
  
  # Tree: only possible with at least two clusters at this resolution
  if (length(levels(sc[[n, drop=TRUE]])) > 1) {
    Seurat::Idents(sc) = n
    sc = suppressWarnings(BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
    l = list(Seurat::Tool(sc, "BuildClusterTree"))
    names(l) = n
    # Append this resolution's tree to the existing list of trees
    suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), l)})
  }
  
  # One colour per cluster level, named by cluster label
  col = GenerateColours(num_colours=length(levels(sc[[n, drop=TRUE]])), names=levels(sc[[n, drop=TRUE]]), 
                                     palette=param$col_palette_clusters, alphas=1)
  # Colours
  # NOTE(review): tree_color is overwritten every iteration, so after the loop
  # it holds only the LAST resolution's palette; later overlay plots use it --
  # confirm this is intended.
  tree_color = col
  tree_color_list = list(col)
  names(tree_color_list) = n
  sc = ScAddLists(sc, lists=tree_color_list, lists_slot="colour_lists")
}

Clustering tree

Clustering is used to group similar samples. One problem when conducting cluster analysis is deciding on a number of clusters, usually controlled by a parameter such as a numerical value for the resolution or a k integer for k-means clustering. A clustering tree visualises the relationships between clusters at a range of resolutions and can aid in the decision.

What is a clustering tree?

A clustering tree shows how cells move as the clustering resolution is increased. Each level of the tree represents a different resolution. Each cluster forms a node in the tree. The edges are a representation of the number of cells from a lower resolution that contribute to a specific cluster at the next higher resolution. Thus, the clustering tree indicates how clusters are related to each other, which clusters are distinct and do not change with the increasing resolution, and which are unstable. E.g. we can see how one cluster is subdivided into others. Hence, the clustering tree can help to identify which clusters might be candidates for merging. On the other hand, nodes with multiple incoming edges would indicate that we might have over-clustered the data.

Here, we use the clustree package Zappia and Oshlack (2018). While the nodes in the first graph are colored by the cluster resolution, the second plot is colored by the SC3 stability index, a measure of cluster stability across resolutions.

# Clustering tree across all computed resolutions (one level per resolution).
# First plot: nodes coloured by resolution; second: by SC3 stability index.
p1 = clustree::clustree(sc, prefix = paste0(DefaultAssay(sc),"_snn_res."), layout = "sugiyama") +
    scale_color_brewer(palette = "BrBG") +
    scale_edge_color_continuous(low = "black", high = "gold")

p2 = clustree::clustree(sc, prefix = paste0(DefaultAssay(sc),"_snn_res."), node_colour = "sc3_stability", layout = "sugiyama") 

# Side-by-side via patchwork
p = p1 + p2
p

Overlay of clustering tree onto umap projection.

# Generate table with umap projection and resolutions
# Combines the 2D UMAP coordinates with every "res." metadata column into one
# data.frame, as required by clustree_overlay()
tbl_projection = as.data.frame(sc[["umap"]]@cell.embeddings[,1:2])
tbl_resolutions = as.data.frame(sc@meta.data %>% dplyr::select(contains("res.")))
tbl = cbind(tbl_projection, tbl_resolutions)

# Plot overlay of cluster tree with umap
# Returns a list with an "overlay" (top view) plot and two side-view plots
overlay_list = clustree::clustree_overlay(tbl, prefix = paste0(DefaultAssay(sc),"_snn_res."), x_value = "umap_1", y_value = "umap_2", plot_sides = TRUE, use_colour = "points", alt_colour = "black")

# NOTE(review): tree_color is set in the earlier per-resolution loop and holds
# the palette of the last processed resolution -- confirm that is the palette
# intended for these overlays.
p1 = overlay_list$overlay +
  scale_color_manual(values = tree_color) +
  scale_fill_brewer(palette = "BrBG", direction = -1) + 
  AddStyle(legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2") + 
  patchwork::plot_annotation(title="Top view")
p1

# Side views: projection of the tree onto each UMAP axis separately
p2 = overlay_list$x_side +
  scale_color_manual(values = tree_color) +
  scale_fill_brewer(palette = "BrBG", direction = -1) + 
  AddStyle(legend_position="none", xlab = "UMAP 1")
p3 = overlay_list$y_side +
  scale_color_manual(values = tree_color) +
  scale_fill_brewer(palette = "BrBG", direction = -1) + 
  AddStyle(legend_position="none", xlab = "UMAP 2")
p = p2 + p3 + 
  patchwork::plot_annotation(title="Side views")
p


Visualisation with UMAP

Here we plot the UMAP projection colored by clusters obtained using different resolutions.

# Restrict the tree resolutions shown here to 0.4-1.0 (drop 0 and 0.2).
# Fix: the previous index 3:7 ran past the 6-element vector and produced a
# trailing NA (silently dropped later by sort()); 3:6 selects exactly 0.4-1.0.
tree_resolutions = tree_resolutions[3:6]
cluster_resolutions = sort(unique(c(param$cluster_resolution, param$cluster_resolution_test, tree_resolutions)))

# Define height of test clusters
# Fix: compute the grid layout AFTER cluster_resolutions is finalised -- it was
# previously derived from the old, larger resolution set.
height_per_row = 3
nr_cols = 3
nr_rows = ceiling(length(cluster_resolutions)/nr_cols)
fig_height_test_clusters = nr_rows * height_per_row

# One UMAP panel per clustering resolution, cells coloured by cluster identity;
# legend labels include cluster sizes, panels arranged in a fixed-column grid.
p_list = list()

for(r in cluster_resolutions) {
  r = as.character(r)
  # Metadata column holding cluster assignments for this resolution
  n = paste0(DefaultAssay(sc), "_snn_res.", r)
  
  # Cluster sizes, used to annotate the legend labels as "cluster (n cells)"
  cluster_cells = table(sc[[n, drop=TRUE]])
  cluster_labels = paste0(names(cluster_cells)," (", cluster_cells,")")
  
  p_list[[r]] = Seurat::DimPlot(sc, reduction="umap", group.by=n, pt.size=param$pt_size, label=TRUE ) + 
    scale_color_manual(values=Seurat::Misc(sc, "colour_lists")[[n]], labels=cluster_labels) +
    AddStyle(title=r, legend_position="none", xlab = "UMAP 1", ylab = "UMAP 2") +
    FontSize(x.text = 10, y.text = 10, x.title = 13, y.title = 13, main = 15)
}
  
p = patchwork::wrap_plots(p_list, ncol=nr_cols) + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity for different resolution values")
p



Data export

# Attach experiment details (project id, run date, species derived from the
# biomaRt dataset name) to the object's Misc slot before export
# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))

# Add parameter
Seurat::Misc(sc, "parameters") = param

# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))
### Export seurat object
# Serialize the full Seurat object for later import into R
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
### Convert data
################################################################################

### Export to Loupe Cell Browser
# Only applicable when every input dataset is of type "10x"
if (all(param$path_data$type == "10x")) { 
  
  # Export reductions (umap, pca, others): first two embedding dimensions per
  # reduction, with barcodes, for Loupe's "Import Projection"
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  
  # Export categorical metadata (Loupe's "Import Categories" takes non-numeric columns).
  # vapply over the columns is type-stable, unlike the previous
  # sapply(1:ncol(...), ...) which degrades to a list on zero-column input.
  loupe_meta = sc@meta.data
  idx_keep = !vapply(loupe_meta, is.numeric, logical(1))
  write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"), 
              file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}



### Export as andata object (h5ad file)
# Convert Seurat v5 single cell object to anndata object
#scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad", )

# scesy only worked for v3 and v4 objects
# convert to v3
#sc_v3 = sc
#sc_v3[["RNA"]] = as(sc_v3[["RNA"]], "Assay")
# Convert Seurat v3 single cell object to anndata object
#adata = sceasy::convertFormat(sc_v3, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))
# Write to h5ad file
#adata$write(file.path(param$path_out, "data", "sc.h5ad"), compression="gzip")



### Export count matrix
# Write the raw counts of the requested assay as a sparse matrix folder,
# alongside the listed cell metadata columns (project-local helper)
invisible(ExportSeuratAssayData(sc, 
                                dir=file.path(param$path_out, "data", "counts"), 
                                assays=param$assay_raw, 
                                slot="counts",
                                include_cell_metadata_cols=colnames(sc[[]]),
                                metadata_prefix=paste0(param$project_id, ".")))


### Export metadata
# Full per-cell metadata table as Excel, barcodes as an explicit column
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)
Export seurat object

We export the assay data, cell metadata, clustering and visualization.

Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse matrix)

Export as anndata object

We export the assay data, cell metadata, clustering and visualization in anndata format.

Result file:
* sc.h5ad: H5AD object

Export to Loupe Cell Browser

If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser).

Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell cycle phases

Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.

### Export annotation
################################################################################

# Export annotation as excel file
# Maps Seurat row names back to Ensembl gene ids (seurat_rowname_to_ensembl is
# defined elsewhere in the workflow) and joins the Ensembl annotation table.
# NOTE(review): inner_join silently drops genes without a matching annotation
# row -- confirm this is intended.
openxlsx::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)], row.names=rownames(sc)) %>%
                       dplyr::inner_join(annot_ensembl, by="ensembl_gene_id"),
                     file=file.path(param$path_out, "data", "gene_annotation.xlsx"), 
                     rowNames=FALSE, colNames=TRUE)



Parameters and software versions

The following parameters were used to run the workflow.

Parameters
# Collect the workflow parameters and render them as a styled HTML table
out = ScrnaseqParamsInfo(params=param)

params_table = knitr::kable(out, align="l")
kableExtra::kable_styling(params_table, bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
path_to_git /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis
path_to_basic_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/basic_settings.R
path_to_advanced_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/advanced_settings.R
scriptname scripts/pre-processing/pre-processing.Rmd
author kosankem
assay_raw RNA
downsample_cells_equally FALSE
cell_filter Sample1:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18); Sample2:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18)
feature_filter Sample1:min_counts=1, min_cells=5; Sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
integrate_samples method:merge; dimensions:30; k_anchor:10; k_weight:100; reference:; integration_function:CCAIntegration
experimental_groups homogene
pc_n 8
cluster_k 20
umap_k 30
cluster_resolution_test 0.5, 0.7, 0.8
cluster_resolution 0.6
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
enrichr_site Enrichr
enrichr_padj 0.05
col_palette_samples ggsci::pal_igv
col_palette_clusters ggsci::pal_igv
col_palette_annotation ggsci:pal_ucscgb()
col #0086b3
col_bg #D3D3D3
pt_size 0.5
project_id Testdata
path_data name:Sample1, Sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/
species human
path_out /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing
debugging_mode default_debugging
mart_dataset hsapiens_gene_ensembl
mt ^MT-
enrichr_dbs GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024
annotation_dbs BlueprintEncodeData()
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
path_reference /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98
reference hsapiens_gene_ensembl.v98.annot.txt
file_annot /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx
col_samples Sample1=#5050FFFF, Sample2=#CE3D32FF

This report was generated using the scripts/pre-processing/pre-processing.Rmd script. Software versions were collected at run time.

Software versions
# Collect session/software versions and render them as a styled HTML table
out = ScrnaseqSessionInfo(param$path_to_git)

versions_table = knitr::kable(out, align="l")
kableExtra::kable_styling(versions_table, bootstrap_options=c("striped", "hover"))
Name Value
Run on: Mon Sep 02 04:08:06 PM 2024
ktrns/scrnaseq 33f82bf6981c9d6a7ebaf98b3304e6806278f021
Container NA
R R version 4.2.1 (2022-06-23)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.6 LTS
Host name hpc-rc11
Host OS #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic)
Packages abind1.4-5, ape5.8, backports1.4.1, beeswarm0.4.0, bibtex0.5.1, Biobase2.58.0, BiocGenerics0.44.0, bitops1.0-7, bslib0.6.1, cachem1.0.8, checkmate2.3.1, cli3.6.2, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, curl5.2.1, data.table1.15.2, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, evaluate0.23, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, fitdistrplus1.1-11, future1.33.1, future.apply1.11.1, generics0.1.3, GenomeInfoDb1.34.9, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, glmGamPoi1.10.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gtable0.3.4, highr0.10, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, IRanges2.32.0, irlba2.3.5.1, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, Matrix1.6-5, MatrixGenerics1.10.0, matrixStats1.1.0, memoise2.0.1, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, RANN2.6.1, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RCurl1.98-1.14, RefManageR1.4.0, renv0.16.0, reshape21.4.4, reticulate1.35.0, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, rstudioapi0.15.0, Rtsne0.17, S4Vectors0.36.2, sass0.4.8, scales1.3.0, scattermore1.2, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shiny1.8.0, sp2.1-3, spam2.10-0, sparseMatrixStats1.10.0, spatstat.data3.0-4, 
spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, SummarizedExperiment1.28.0, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, xfun0.41, xml21.3.6, xtable1.8-4, XVector0.38.0, yaml2.3.8, zip2.3.1, zlibbioc1.44.0, zoo1.8-12


Credits and References

This workflow was developed as a module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).

# Writes knitcitations references to references.bib file so the citations used
# throughout this report are available alongside the output.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))
Hafemeister, C., Satija, R., 2019. Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression. Genome Biology 20. https://doi.org/10.1186/s13059-019-1874-1
Hao, Y., Hao, S., Andersen-Nissen, E., III, W.M.M., Zheng, S., Butler, A., Lee, M.J., Wilk, A.J., Darby, C., Zagar, M., Hoffman, P., Stoeckius, M., Papalexi, E., Mimitou, E.P., Jain, J., Srivastava, A., Stuart, T., Fleming, L.B., Yeung, B., Rogers, A.J., McElrath, J.M., Blish, C.A., Gottardo, R., Smibert, P., Satija, R., 2021. Integrated analysis of multimodal single-cell data. Cell. https://doi.org/10.1016/j.cell.2021.04.048
Hao, Y., Stuart, T., Kowalski, M.H., Choudhary, S., Hoffman, P., Hartman, A., Srivastava, A., Molla, G., Madad, S., Fernandez-Granda, C., Satija, R., 2023. Dictionary learning for integrative, multimodal and scalable single-cell analysis. Nature Biotechnology. https://doi.org/10.1038/s41587-023-01767-y
Tirosh, I., Izar, B., Prakadan, S.M., Wadsworth, M.H., Treacy, D., Trombetta, J.J., Rotem, A., Rodman, C., Lian, C., Murphy, G., Fallahi-Sichani, M., Dutton-Regester, K., Lin, J.-R., Cohen, O., Shah, P., Lu, D., Genshaft, A.S., Hughes, T.K., Ziegler, C.G.K., Kazer, S.W., Gaillard, A., Kolb, K.E., Villani, A.-C., Johannessen, C.M., Andreev, A.Y., Allen, E.M.V., Bertagnolli, M., Sorger, P.K., Sullivan, R.J., Flaherty, K.T., Frederick, D.T., Jan’e-Valbuena, J., Yoon, C.H., Rozenblatt-Rosen, O., Shalek, A.K., Regev, A., Garraway, L.A., 2016. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196. https://doi.org/10.1126/science.aad0501
Traag, V.A., Waltman, L., van Eck, N.J., 2019. From louvain to leiden: Guaranteeing well-connected communities. Scientific Reports 9. https://doi.org/10.1038/s41598-019-41695-z
Young, M.D., Behjati, S., 2020. SoupX removes ambient RNA contamination from droplet-based single-cell RNA sequencing data. GigaScience 9, giaa151. https://doi.org/10.1093/gigascience/giaa151
Zappia, L., Oshlack, A., 2018. Clustering trees: A visualization for evaluating clusterings at multiple resolutions. GigaScience 7. https://doi.org/10.1093/gigascience/giy083